In [None]:
import pytz
import datetime
import marimo as mo

# Capture the moment this cell runs, expressed in the Asia/Kolkata zone
# (the submission deadline in the header below is also given in IST).
india_timezone = pytz.timezone("Asia/Kolkata")
now = datetime.datetime.now(tz=india_timezone)

# 12-hour clock with AM/PM and the zone abbreviation appended.
curr = now.strftime("%Y-%m-%d, %I:%M:%S %p %Z")

# Raw f-string: `{curr}` is the only interpolated field.
# NOTE: "Last Updated" therefore shows when the cell was last executed,
# not when the notebook content was last edited.
_intro_md = rf"""
# Week - 6

Common Instructions

- Load the wine dataset from sklearn. 
- Split the dataset into train and test set with **70:30 ratio** with `random_state = 1`
- Use this for all the questions that follow

**Submission Date:** `2025-11-02, 23:59 IST`

**Last Updated:** `{curr}`
"""

# Final expression so the rendered markdown becomes the cell output.
mo.md(_intro_md)

In [None]:
# Single seed reused by the train/test split and by every estimator in this
# notebook, so that results are reproducible across runs.
RANDOM_STATE = 1

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    VotingClassifier,
    BaggingClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
)

In [None]:
# Load the wine dataset object that ships with scikit-learn.
wine_data = load_wine()

# DESCR is plain text, so print() is used to keep its line breaks readable.
_dataset_description = wine_data.DESCR
print(_dataset_description)

In [None]:
# Target labels exactly as provided by the loaded dataset object.
y = wine_data.target

# Feature matrix wrapped in a DataFrame so each column carries its feature name.
X = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

In [None]:
# info() prints its report as a side effect.
X.info()

# Summary statistics, transposed so each feature becomes a row;
# as the last expression this is the cell's displayed output.
_feature_stats = X.describe()
_feature_stats.transpose()

In [None]:
# 70:30 train/test split, seeded with the shared RANDOM_STATE for reproducibility.
_split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = _split_parts

## Question 1 - 2

Train a Logistic Regression model with the following parameters:

- max_iter = 10000
- C = 0.5
- random_state = 1
- solver = 'sag'

Compute the accuracy on the test data.

In [None]:
# Hyperparameters prescribed by Question 1 - 2, gathered in one place.
_lr_settings = {
    "max_iter": 10000,
    "C": 0.5,
    "random_state": RANDOM_STATE,
    "solver": "sag",
}
lr = LogisticRegression(**_lr_settings)

# fit() is the last expression, so the fitted estimator is the cell output.
lr.fit(X_train, y_train)

### Question 1

What is the accuracy of the model on the test data? [Enter 3 decimal places]

In [None]:
# Predict on the held-out test split with the fitted logistic regression.
_lr_test_predictions = lr.predict(X_test)
_lr_test_accuracy = accuracy_score(y_true=y_test, y_pred=_lr_test_predictions)

# Cell output: test accuracy rounded to 3 decimal places.
np.round(_lr_test_accuracy, decimals=3)

### Question 2

How many iterations did the algorithm take to converge?

In [None]:
# n_iter_ is array-valued; summing collapses it to a single number.
# NOTE(review): if the installed scikit-learn reports one entry per class here,
# this is a total across classes rather than one run's iteration count - confirm.
np.sum(lr.n_iter_)

## Question 3 - 4

Perform hyperparameter tuning on a decision tree classifier with **random_state = 1** and **4-fold cross-validation** (`cv = 4`). The tuning is to be done over the following parameters:

- criterion can be 'entropy' or 'gini'
- splitter can be 'random' or 'best'
- Minimum number of samples per leaf as [2,4,6,8,10]
- Maximum depth as [3,4,5,6]

Compute the accuracy on the test data.

In [None]:
# Search space for the decision tree grid search:
# every combination of these values is a candidate.
dt_params_dict = dict(
    criterion=["entropy", "gini"],
    splitter=["random", "best"],
    min_samples_leaf=[2, 4, 6, 8, 10],
    max_depth=[3, 4, 5, 6],
)

In [None]:
# Base decision tree handed to the grid search; only the seed is fixed here,
# every other hyperparameter is left at its default.
dtc = DecisionTreeClassifier(random_state=RANDOM_STATE)

In [None]:
# Grid search over dt_params_dict, evaluating each candidate with cv=4.
dt_grid_search = GridSearchCV(
    estimator=dtc,
    param_grid=dt_params_dict,
    cv=4,
)

# fit() is the last expression, so the fitted search object is the cell output.
dt_grid_search.fit(X_train, y_train)

### Question 3

What is the accuracy of the model on the test data? [Enter 3 decimal places]

In [None]:
# Predict on the held-out test split via the fitted grid-search object.
_dt_test_predictions = dt_grid_search.predict(X_test)
_dt_test_accuracy = accuracy_score(y_true=y_test, y_pred=_dt_test_predictions)

# Cell output: test accuracy rounded to 3 decimal places.
np.round(_dt_test_accuracy, decimals=3)

### Question 4

What is the value of best max_depth after training with GridSearchCV?

In [None]:
# Pull the winning max_depth out of the grid search's selected parameters.
_best_tree_params = dt_grid_search.best_params_
_best_tree_params["max_depth"]

## Question 5 - 6

Perform hyperparameter tuning on an AdaBoost classifier with **random_state = 1** and **4-fold cross-validation** (`cv = 4`). The tuning is to be done over the following parameters:

- Number of estimators as [100, 500, 1000]
- Learning rate as [0.5, 1, 2]

Compute the accuracy on the test data.

In [None]:
# Search space for the AdaBoost grid search:
# every combination of these values is a candidate.
ada_params_dict = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.5, 1, 2],
)

In [None]:
# Base AdaBoost classifier handed to the grid search; only the seed is fixed
# here, every other hyperparameter is left at its default.
ada = AdaBoostClassifier(random_state=RANDOM_STATE)

In [None]:
# Grid search over ada_params_dict, evaluating each candidate with cv=4.
ada_grid_search = GridSearchCV(
    estimator=ada,
    param_grid=ada_params_dict,
    cv=4,
)

# fit() is the last expression, so the fitted search object is the cell output.
ada_grid_search.fit(X_train, y_train)

### Question 5

What is the accuracy of the model on the test data? [Enter 3 decimal places]

In [None]:
# Predict on the held-out test split via the fitted grid-search object.
_ada_test_predictions = ada_grid_search.predict(X_test)
_ada_test_accuracy = accuracy_score(y_true=y_test, y_pred=_ada_test_predictions)

# Cell output: test accuracy rounded to 3 decimal places.
np.round(_ada_test_accuracy, decimals=3)

### Question 6

What is the value of best n_estimators after training with GridSearchCV?

In [None]:
# Pull the winning n_estimators out of the grid search's selected parameters.
_best_ada_params = ada_grid_search.best_params_
_best_ada_params["n_estimators"]

## Question 7

Train a voting classifier by making use of the following estimators:
- Bagging Classifier
- Random Forest Classifier
- GradientBoosting Classifier

with **random_state = 1** for each of the estimators.

Train the model on the training data and compute the score on the test data.

In [None]:
# The three base learners for the voting ensemble, each seeded with the
# shared RANDOM_STATE as the question requires.
bc = BaggingClassifier(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)

# (name, estimator) pairs consumed by VotingClassifier. The names are the keys
# under which each fitted estimator can later be looked up, so they must be
# spelled correctly ("gradient_boosting", previously misspelled "gradient_boostig").
models = [("bagging", bc), ("random_forest", rf), ("gradient_boosting", gb)]

In [None]:
# Combine the named base learners into one ensemble. No `voting` argument is
# passed, so scikit-learn's default voting strategy applies.
voting = VotingClassifier(estimators=models)

# fit() is the last expression, so the fitted ensemble is the cell output.
voting.fit(X_train, y_train)

### Question 7

What is the accuracy of the model on the test data? [Enter 3 decimal places]

In [None]:
# Predict on the held-out test split with the fitted voting ensemble.
_voting_test_predictions = voting.predict(X_test)
_voting_test_accuracy = accuracy_score(y_true=y_test, y_pred=_voting_test_predictions)

# Cell output: test accuracy rounded to 3 decimal places.
np.round(_voting_test_accuracy, decimals=3)